Prepare dataset

library(easypackages)
libraries("here","tidyverse","foreign","lmerTest","psych",
          "ggeasy","emmeans","patchwork","reshape2")

codepath = here("code")
source(file.path(codepath,"utils.R"))
datapath = here("data")
plotpath = here("plots")
resultpath = here("results")

fontSize = 24
cols2use = c("#619cff","#ffa500")

data_fname = file.path(datapath, "MIRA dataset 10-21-23.sav")
tidy_data = prep_dataset(data_fname)

Contingency tables describing data

#  Sex x Treatment Type
table(tidy_data$sex,tidy_data$tx)
##         
##          ASAP LEAP ESDM NDBI NDBI + VF TEACCH EIBI PRT STAR/IMPACT EIBI+PRT DTT
##   Female    9   12   74    6         2     19   31   2           9       16   2
##   Male     49   47  260   27         4    107  213  18          39      102  11
##         
##          STAR ABA
##   Female    0   0
##   Male      5   0
# Responder Status
table(tidy_data$responder15)
## 
## Non-Responder     Responder 
##           102           204
table(tidy_data$responder24)
## 
## Non-Responder     Responder 
##           247           242
# Responder Status x Treatment Type
table(tidy_data$responder15,tidy_data$tx)
##                
##                 ASAP LEAP ESDM NDBI NDBI + VF TEACCH EIBI PRT STAR/IMPACT
##   Non-Responder    5    7   41    5         2     12   12   4           5
##   Responder       15    2   84    7         3     15   30   2          27
##                
##                 EIBI+PRT DTT STAR ABA
##   Non-Responder        6   3    0   0
##   Responder           10   8    1   0
table(tidy_data$responder24,tidy_data$tx)
##                
##                 ASAP LEAP ESDM NDBI NDBI + VF TEACCH EIBI PRT STAR/IMPACT
##   Non-Responder   18   12   80   11         3     31   36   6          21
##   Responder       16    9   92   10         2     23   42   0          23
##                
##                 EIBI+PRT DTT STAR ABA
##   Non-Responder       14  13    2   0
##   Responder           16   6    3   0
# Responder Status x Sex
table(tidy_data$responder15,tidy_data$sex)
##                
##                 Female Male
##   Non-Responder     18   83
##   Responder         38  161
table(tidy_data$responder24,tidy_data$sex)
##                
##                 Female Male
##   Non-Responder     52  191
##   Responder         43  195
# Responder Status x Treatment ESDM vs Other
table(tidy_data$responder15,tidy_data$tx_mvl)
##                
##                 Other ESDM
##   Non-Responder    61   41
##   Responder       120   84
table(tidy_data$responder24,tidy_data$tx_mvl)
##                
##                 Other ESDM
##   Non-Responder   167   80
##   Responder       150   92
# Responder Status x Treatment Type (Broad)
table(tidy_data$responder15,tidy_data$tx_broad)
##                
##                 Other EIBI NDBI ESDM
##   Non-Responder    19   15   27   41
##   Responder        17   38   65   84
table(tidy_data$responder24,tidy_data$tx_broad)
##                
##                 Other EIBI NDBI ESDM
##   Non-Responder    43   49   75   80
##   Responder        32   48   70   92

Preprocess data before modeling

The continuous predictors in the model should be scaled (e.g., z-scored) first.

categorical_predictors = c("tx_broad","sex")

# scale predictors as a pre-processing step for the continuous variables that will be modeled
continuous_predictors = c("age_at_start",
                          "tx_duration",
                          "intensity",
                          "pre_imitation",
                          "pre_vabs_abc",
                          "pre_ados_css",
                          # "pretx_dq",
                          "pretx_verbal_dq",
                          "pretx_nonverbal_dq")

pcavars = c("age_at_start","tx_duration","intensity",
            "pre_imitation",
            "pre_vabs_abc",
            "pre_ados_css",
            "pretx_verbal_dq",
            "pretx_nonverbal_dq")

Acquisition of phrase speech

IVs: sex + tx_broad + age_at_start + intensity + pre_imitation + pre_vabs_abc + pre_ados_css + pretx_verbal_dq + pretx_nonverbal_dq

RFX: (1 | dataset)

responder_var = "responder24"

# Responder Status x Treatment Type (Broad)
tmp_tbl = table(tidy_data[,responder_var],tidy_data$tx_broad)
print(tmp_tbl)
##                
##                 Other EIBI NDBI ESDM
##   Non-Responder    43   49   75   80
##   Responder        32   48   70   92
max2use = max(tmp_tbl)
data2plot = data.frame(t(tmp_tbl))
# cols2use = get_ggColorHue(4)

tx_label2use = "ESDM"
p_esdm = ggplot(data = data2plot[data2plot$Var1==tx_label2use,], aes(x=Var2,y=Freq)) + 
  geom_bar(stat="identity",aes(fill = Var2)) + 
  xlab("Responder Type") + 
  ylab("Sample Size") + ylim(0,max2use) + 
  scale_fill_manual(values=cols2use) + 
  scale_x_discrete(labels = c("Non-Advancer", "Advancer")) +
  theme_bw() + 
  guides(fill="none") +
  ggtitle(tx_label2use) + 
  easy_center_title() +  
  theme(plot.title = element_text(hjust = 0.5, size=fontSize+8),
    strip.text.y = element_text(size=fontSize),
    axis.text.x = element_text(size=fontSize),
    axis.text.y = element_text(size=fontSize-2),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size=fontSize),
    strip.text = element_text(size = fontSize))

tx_label2use = "EIBI"
p_eibi = ggplot(data = data2plot[data2plot$Var1==tx_label2use,], aes(x=Var2,y=Freq)) + 
  geom_bar(stat="identity",aes(fill = Var2)) + 
  xlab("Responder Type") + 
  ylab("Sample Size") + ylim(0,max2use) + 
  scale_fill_manual(values=cols2use) + 
  scale_x_discrete(labels = c("Non-Advancer", "Advancer")) +
  theme_bw() + 
  guides(fill="none") +
  ggtitle(tx_label2use) + 
  easy_center_title() +  
  theme(plot.title = element_text(hjust = 0.5, size=fontSize+8),
    strip.text.y = element_text(size=fontSize),
    axis.text.x = element_text(size=fontSize),
    axis.text.y = element_text(size=fontSize-2),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size=fontSize),
    strip.text = element_text(size = fontSize))

tx_label2use = "NDBI"
p_ndbi = ggplot(data = data2plot[data2plot$Var1==tx_label2use,], aes(x=Var2,y=Freq)) + 
  geom_bar(stat="identity",aes(fill = Var2)) + 
  xlab("Responder Type") + 
  ylab("Sample Size") + ylim(0,max2use) + 
  scale_fill_manual(values=cols2use) + 
  scale_x_discrete(labels = c("Non-Advancer", "Advancer")) +
  theme_bw() + 
  guides(fill="none") +
  ggtitle(tx_label2use) + 
  easy_center_title() +  
  theme(plot.title = element_text(hjust = 0.5, size=fontSize+8),
    strip.text.y = element_text(size=fontSize),
    axis.text.x = element_text(size=fontSize),
    axis.text.y = element_text(size=fontSize-2),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size=fontSize),
    strip.text = element_text(size = fontSize))

tx_label2use = "Other"
p_other = ggplot(data = data2plot[data2plot$Var1==tx_label2use,], aes(x=Var2,y=Freq)) + 
  geom_bar(stat="identity",aes(fill = Var2)) + 
  xlab("Responder Type") + 
  ylab("Sample Size") + ylim(0,max2use) + 
  scale_fill_manual(values=cols2use) + 
  scale_x_discrete(labels = c("Non-Advancer", "Advancer")) +
  theme_bw() + 
  guides(fill="none") +
  ggtitle("Classroom-Based") + 
  easy_center_title() +  
  theme(plot.title = element_text(hjust = 0.5, size=fontSize+8),
    strip.text.y = element_text(size=fontSize),
    axis.text.x = element_text(size=fontSize),
    axis.text.y = element_text(size=fontSize-2),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size=fontSize),
    strip.text = element_text(size = fontSize))

p_final = p_esdm + p_ndbi + p_eibi + p_other + plot_layout(nrow=2,ncol=2)
ggsave(filename = file.path(plotpath, "tx_sample_size_24mo.pdf"), plot = p_final,
       width = 16, height=10)
p_final

# Correlation matrix after using only subjects with complete responder18 labels
mask = rowSums(is.na(tidy_data[,c(responder_var,continuous_predictors)]))==0
data2use = tidy_data %>% filter(mask)

corr_mat = cor(data2use[,continuous_predictors], use="pairwise.complete.obs")
dist = as.dist((1-corr_mat)/2)
hc = hclust(dist)
corr_mat = corr_mat[hc$order, hc$order]
data2plot = melt(corr_mat)
# colnames(data2plot)[3] = "r"
p = ggplot(data = data2plot, aes(x = Var1, y = Var2, fill = value, label = round(value,2))) +
  geom_tile() +
  geom_text() + 
  labs(x = NULL, y = NULL, fill = "r") +
  scale_fill_gradientn(colours = colorRampPalette(c("blue","white","red"))(100), 
                       limits = c(-1,1), 
                       oob = scales::squish) +
  scale_x_discrete(labels = c("Duration", "VABS ABC","Verbal DQ","Nonverbal DQ", 
                              "ADOS CSS", "Imitation","Age at Start","Intensity")) +
  scale_y_discrete(labels = c("Duration", "VABS ABC","Verbal DQ","Nonverbal DQ", 
                              "ADOS CSS", "Imitation","Age at Start","Intensity")) + 
  ggtitle("Predictor Variable Correlations") + 
  easy_center_title() +  
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 0.95),
        axis.text.y = element_text(hjust = 0.95),
        plot.title = element_text(hjust = 0.5),
        legend.title.align=0.22)

p = p + easy_rotate_x_labels(angle = 90) +
  theme(axis.text.y = element_text(hjust = 0.95), 
        axis.text.x = element_text(hjust = 0.95),
        legend.title.align=0.22)
ggsave(filename = file.path(plotpath, "24mo_corrmat.pdf"), plot = p)
p

# run again with PCA on continuous predictors
form2use = as.formula(sprintf("%s ~ sex + tx_broad + PC1 + PC2 + PC3 + PC4 + PC5 + (1 | dataset)",responder_var))

# fit model
result = logistic_regression(data = tidy_data, 
                             formula = form2use, 
                             vars2scale = continuous_predictors, 
                             responder_var = responder_var,
                             do_pca = TRUE,
                             pcavars = pcavars,
                             recon_pcs = c("PC1","PC2"))
## Generalized linear mixed model fit by maximum likelihood (Adaptive
##   Gauss-Hermite Quadrature, nAGQ = 10) [glmerMod]
##  Family: binomial  ( logit )
## Formula: responder24 ~ sex + tx_broad + PC1 + PC2 + PC3 + PC4 + PC5 +  
##     (1 | dataset)
##    Data: scaled_data2use
##      AIC      BIC   logLik deviance df.resid 
## 181.4995 215.9933 -79.7498 159.4995      159 
## Random effects:
##  Groups  Name        Std.Dev.
##  dataset (Intercept) 0       
## Number of obs: 170, groups:  dataset, 8
## Fixed Effects:
##  (Intercept)       sexMale  tx_broadEIBI  tx_broadNDBI  tx_broadESDM  
##     -0.48192       0.44672       0.63065      -0.86941       0.69720  
##          PC1           PC2           PC3           PC4           PC5  
##     -0.88842       0.82000       0.20767       0.05982       0.31363  
## optimizer (bobyqa) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings 
##                     beta   ci.low95  ci.high85        OR ci.OR.low95
## (Intercept)  -0.48191892 -1.9385590  0.9747212 0.6175971  0.14391117
## sexMale       0.44672117 -0.4826240  1.3760664 1.5631784  0.61716183
## tx_broadEIBI  0.63065316 -2.3939641  3.6552705 1.8788374  0.09126717
## tx_broadNDBI -0.86940788 -2.8842893  1.1454735 0.4191997  0.05589450
## tx_broadESDM  0.69719551 -0.9272554  2.3216464 2.0081131  0.39563809
## PC1          -0.88842397 -1.3130665 -0.4637814 0.4113035  0.26899391
## PC2           0.82000104  0.3383907  1.3016114 2.2705022  1.40268845
## PC3           0.20766558 -0.3003949  0.7157260 1.2308015  0.74052573
## PC4           0.05981975 -0.3747797  0.4944192 1.0616452  0.68744072
## PC5           0.31363182 -0.3214205  0.9486841 1.3683858  0.72511830
##              ci.OR.high95         pval          fdr
## (Intercept)      2.650428 5.166927e-01 0.6458658597
## sexMale          3.959296 3.461208e-01 0.6043629479
## tx_broadEIBI    38.677981 6.827795e-01 0.7586438491
## tx_broadNDBI     3.143930 3.977051e-01 0.6043629479
## tx_broadESDM    10.192442 4.002307e-01 0.6043629479
## PC1              0.628901 4.119892e-05 0.0004119892
## PC2              3.675214 8.464469e-04 0.0042322345
## PC3              2.045671 4.230541e-01 0.6043629479
## PC4              1.639546 7.873287e-01 0.7873286743
## PC5              2.582309 3.330540e-01 0.6043629479
# pairwise comparisons of tx_broad
pairwise_res = emmeans(result$model, pairwise ~ tx_broad)
pairwise_res
## $emmeans
##  tx_broad emmean    SE  df asymp.LCL asymp.UCL
##  Other    -0.176 0.661 Inf    -1.471    1.1197
##  EIBI      0.455 1.442 Inf    -2.372    3.2816
##  NDBI     -1.045 0.519 Inf    -2.062   -0.0285
##  ESDM      0.521 0.459 Inf    -0.377    1.4204
## 
## Results are averaged over the levels of: sex 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast     estimate    SE  df z.ratio p.value
##  Other - EIBI  -0.6307 1.543 Inf  -0.409  0.9770
##  Other - NDBI   0.8694 1.028 Inf   0.846  0.8326
##  Other - ESDM  -0.6972 0.829 Inf  -0.841  0.8348
##  EIBI - NDBI    1.5001 1.555 Inf   0.965  0.7695
##  EIBI - ESDM   -0.0665 1.497 Inf  -0.044  1.0000
##  NDBI - ESDM   -1.5666 0.676 Inf  -2.317  0.0942
## 
## Results are averaged over the levels of: sex 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 4 estimates
summary(result$pca_res)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.8636 1.1845 1.0025 0.9368 0.7359 0.60643 0.46491
## Proportion of Variance 0.4341 0.1754 0.1256 0.1097 0.0677 0.04597 0.02702
## Cumulative Proportion  0.4341 0.6095 0.7351 0.8448 0.9125 0.95849 0.98551
##                            PC8
## Standard deviation     0.34044
## Proportion of Variance 0.01449
## Cumulative Proportion  1.00000
data2plot = result$pca_res$rotation[hc$order,]
data2plot = melt(data2plot)
p = ggplot(data = data2plot, aes(x = Var2, y = Var1, fill = value, label = round(value,2))) +
  geom_tile() +
  geom_text() + 
  labs(x = NULL, y = NULL, fill = "Loading") +
  scale_fill_gradientn(colours = colorRampPalette(c("blue","white","red"))(100), 
                       limits = c(-0.5,0.5), 
                       oob = scales::squish) +
  scale_x_discrete(labels = colnames(result$pca_res$rotation)) +
  scale_y_discrete(labels = rev(c("Intensity","Age at Start","Imitation","ADOS CSS", 
                              "Nonverbal DQ","Verbal DQ","VABS ABC","Duration"))) + 
  ggtitle("PCA Loadings") + 
  easy_center_title() +  
  theme_minimal() +
  theme(axis.text.y = element_text(hjust = 0.95),
        plot.title = element_text(hjust = 0.5))
ggsave(filename = file.path(plotpath, "24mo_pcaloadings.pdf"), plot = p)
p

# plot screeplot
pca_res = prcomp(data2use[,pcavars], scale=TRUE)
data2plot = data.frame(comps = colnames(pca_res$rotation),
           percent_var = ((pca_res$sdev^2) / sum(pca_res$sdev^2)) * 100)
p = ggplot(data = data2plot, aes(x = comps, y = percent_var)) + 
  geom_point() + 
  geom_line(aes(group=1)) + 
  xlab("Components") + ylab("Percentage Variance Explained") + 
  theme_bw() + 
  theme(plot.title = element_text(hjust = 0.5, size=fontSize),
      strip.text.y = element_text(size=fontSize),
      axis.text.x = element_text(size=fontSize),
      axis.text.y = element_text(size=fontSize-2),
      axis.title.x = element_blank(),
      axis.title.y = element_text(size=fontSize),
      strip.text = element_text(size = fontSize))
ggsave(filename = file.path(plotpath,"24mo_screeplot.pdf"), plot = p)
p

# Plot the PCs by responder status
data2plot = result$data
vars2plot = c("PC1","PC2")
for (var2plot in vars2plot){
  p = ggplot(data = data2plot, aes(x = .data[[responder_var]], 
                                    y = .data[[var2plot]], 
                                    colour = .data[[responder_var]])) + 
    geom_scatterbox() + ggtitle(var2plot) + easy_center_title()
  print(p)
  
  d_res = cohens_d(x = data2plot[data2plot[,responder_var]=="Responder", var2plot], 
           y = data2plot[data2plot[,responder_var]=="Non-Responder", var2plot])
  print(sprintf("Effect size for %s is d = %f", var2plot, d_res))
}

## [1] "Effect size for PC1 is d = -0.929401"

## [1] "Effect size for PC2 is d = 0.653847"
# plot recon data retaining PC1 and PC2
tmp_res = logistic_regression(data = tidy_data, 
                             formula = form2use, 
                             vars2scale = continuous_predictors, 
                             responder_var = responder_var,
                             do_pca = TRUE,
                             pcavars = pcavars,
                             recon_pcs = c("PC1","PC2"),
                             print_output = FALSE)
data2plot = tmp_res$pca_recon_data

vars2plot = c("age_at_start","tx_duration","intensity",
              "pre_imitation","pre_vabs_abc","pre_ados_css",
              "pretx_verbal_dq","pretx_nonverbal_dq")
varnames2use = c("Age at Start","Treatment Duration","Treatment Intensity",
              "Imitation","VABS ABC","ADOS CSS",
              "Verbal DQ","Nonverbal DQ")
i = 0
plots2save = list()
for (var2plot in vars2plot){
  i = i+1
  p = ggplot(data = data2plot, aes(x = .data[[responder_var]], 
                                    y = .data[[var2plot]], 
                                    colour = .data[[responder_var]])) + 
    # geom_scatterbox() + 
    geom_jitter(size=10) +
    geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA) +
    ylab(varnames2use[i]) + 
    scale_colour_manual(values=cols2use) + 
    scale_x_discrete(labels = c("Non-Advancer", "Advancer")) +
    guides(colour="none") +
    ggtitle(varnames2use[i]) + easy_center_title() + 
    easy_remove_x_axis(what = "title") + 
    theme_bw() + 
    theme(plot.title = element_text(hjust = 0.5, size=fontSize+8),
      strip.text.y = element_text(size=fontSize),
      axis.text.x = element_text(size=fontSize),
      axis.text.y = element_text(size=fontSize-2),
      axis.title.x = element_blank(),
      axis.title.y = element_text(size=fontSize),
      strip.text = element_text(size = fontSize))
  ggsave(filename = file.path(plotpath,sprintf("%s_24mo_reconplot.pdf",var2plot)), plot = p)
  plots2save[[i]] = p
  print(p)
  
  d_res = cohens_d(x = data2plot[data2plot[,responder_var]=="Responder", var2plot], 
         y = data2plot[data2plot[,responder_var]=="Non-Responder", var2plot])
  print(sprintf("Effect size for %s is d = %f", var2plot, d_res))
}
## [1] "Effect size for age_at_start is d = -0.489102"

## [1] "Effect size for tx_duration is d = 0.504642"

## [1] "Effect size for intensity is d = -0.354117"

## [1] "Effect size for pre_imitation is d = 0.725119"

## [1] "Effect size for pre_vabs_abc is d = 1.124856"

## [1] "Effect size for pre_ados_css is d = -1.092412"

## [1] "Effect size for pretx_verbal_dq is d = 1.135789"

## [1] "Effect size for pretx_nonverbal_dq is d = 0.934539"
p_final = plots2save[[1]] + plots2save[[3]] + plots2save[[6]] + plots2save[[4]] + plots2save[[5]] + plots2save[[8]] + plots2save[[7]] + plots2save[[2]] + plot_layout(nrow = 2, ncol = 4)
ggsave(filename = file.path(plotpath,"24mo_reconplot.pdf"), plot = p_final, 
       width = 24, height=15)
p_final

Acquisition of single words

IVs: sex + tx_broad + age_at_start + intensity + pre_imitation + pre_vabs_abc + pre_ados_css + pretx_verbal_dq + pretx_nonverbal_dq

RFX: (1 | dataset)

responder_var = "responder15"

# Responder Status x Treatment Type (Broad)
tmp_tbl = table(tidy_data[,responder_var],tidy_data$tx_broad)
print(tmp_tbl)
##                
##                 Other EIBI NDBI ESDM
##   Non-Responder    19   15   27   41
##   Responder        17   38   65   84
max2use = max(tmp_tbl)
data2plot = data.frame(t(tmp_tbl))
# cols2use = get_ggColorHue(4)

tx_label2use = "ESDM"
p_esdm = ggplot(data = data2plot[data2plot$Var1==tx_label2use,], aes(x=Var2,y=Freq)) + 
  geom_bar(stat="identity",aes(fill = Var2)) + 
  xlab("Responder Type") + 
  ylab("Sample Size") + ylim(0,max2use) + 
  scale_fill_manual(values=cols2use) + 
  scale_x_discrete(labels = c("Non-Advancer", "Advancer")) +
  theme_bw() + 
  guides(fill="none") +
  ggtitle(tx_label2use) + 
  easy_center_title() +  
  theme(plot.title = element_text(hjust = 0.5, size=fontSize+8),
    strip.text.y = element_text(size=fontSize),
    axis.text.x = element_text(size=fontSize),
    axis.text.y = element_text(size=fontSize-2),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size=fontSize),
    strip.text = element_text(size = fontSize))

tx_label2use = "EIBI"
p_eibi = ggplot(data = data2plot[data2plot$Var1==tx_label2use,], aes(x=Var2,y=Freq)) + 
  geom_bar(stat="identity",aes(fill = Var2)) + 
  xlab("Responder Type") + 
  ylab("Sample Size") + ylim(0,max2use) + 
  scale_fill_manual(values=cols2use) + 
  scale_x_discrete(labels = c("Non-Advancer", "Advancer")) +
  theme_bw() + 
  guides(fill="none") +
  ggtitle(tx_label2use) + 
  easy_center_title() +  
  theme(plot.title = element_text(hjust = 0.5, size=fontSize+8),
    strip.text.y = element_text(size=fontSize),
    axis.text.x = element_text(size=fontSize),
    axis.text.y = element_text(size=fontSize-2),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size=fontSize),
    strip.text = element_text(size = fontSize))

tx_label2use = "NDBI"
p_ndbi = ggplot(data = data2plot[data2plot$Var1==tx_label2use,], aes(x=Var2,y=Freq)) + 
  geom_bar(stat="identity",aes(fill = Var2)) + 
  xlab("Responder Type") + 
  ylab("Sample Size") + ylim(0,max2use) + 
  scale_fill_manual(values=cols2use) + 
  scale_x_discrete(labels = c("Non-Advancer", "Advancer")) +
  theme_bw() + 
  guides(fill="none") +
  ggtitle(tx_label2use) + 
  easy_center_title() +  
  theme(plot.title = element_text(hjust = 0.5, size=fontSize+8),
    strip.text.y = element_text(size=fontSize),
    axis.text.x = element_text(size=fontSize),
    axis.text.y = element_text(size=fontSize-2),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size=fontSize),
    strip.text = element_text(size = fontSize))

tx_label2use = "Other"
p_other = ggplot(data = data2plot[data2plot$Var1==tx_label2use,], aes(x=Var2,y=Freq)) + 
  geom_bar(stat="identity",aes(fill = Var2)) + 
  xlab("Responder Type") + 
  ylab("Sample Size") + ylim(0,max2use) + 
  scale_fill_manual(values=cols2use) + 
  scale_x_discrete(labels = c("Non-Advancer", "Advancer")) +
  theme_bw() + 
  guides(fill="none") +
  ggtitle("Classroom-Based") + 
  easy_center_title() +  
  theme(plot.title = element_text(hjust = 0.5, size=fontSize+8),
    strip.text.y = element_text(size=fontSize),
    axis.text.x = element_text(size=fontSize),
    axis.text.y = element_text(size=fontSize-2),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size=fontSize),
    strip.text = element_text(size = fontSize))

p_final = p_esdm + p_ndbi + p_eibi + p_other + plot_layout(nrow=2,ncol=2)
ggsave(filename = file.path(plotpath, "tx_sample_size_15mo.pdf"), plot = p_final,
       width = 16, height = 10)
p_final

# Correlation matrix after using only subjects with complete responder18 labels
mask = rowSums(is.na(tidy_data[,c(responder_var,continuous_predictors)]))==0
data2use = tidy_data %>% filter(mask)

corr_mat = cor(data2use[,continuous_predictors], use="pairwise.complete.obs")
dist = as.dist((1-corr_mat)/2)
hc = hclust(dist)
corr_mat = corr_mat[hc$order, hc$order]
data2plot = melt(corr_mat)
# colnames(data2plot)[3] = "r"
p = ggplot(data = data2plot, aes(x = Var1, y = Var2, fill = value, label = round(value,2))) +
  geom_tile() +
  geom_text() + 
  labs(x = NULL, y = NULL, fill = "r") +
  scale_fill_gradientn(colours = colorRampPalette(c("blue","white","red"))(100), 
                       limits = c(-1,1), 
                       oob = scales::squish) +
  scale_x_discrete(labels = c("Imitation", "Duration","VABS ABC","Verbal DQ", 
                              "Nonverbal DQ", "ADOS CSS","Age at Start","Intensity")) +
  scale_y_discrete(labels = c("Imitation", "Duration","VABS ABC","Verbal DQ", 
                              "Nonverbal DQ", "ADOS CSS","Age at Start","Intensity")) +
  ggtitle("Predictor Variable Correlations") + 
  easy_center_title() +  
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 0.95),
        axis.text.y = element_text(hjust = 0.95), 
        plot.title = element_text(hjust = 0.5),
        legend.title.align=0.22)

p = p + easy_rotate_x_labels(angle = 90) +
  theme(axis.text.y = element_text(hjust = 0.95), 
        axis.text.x = element_text(hjust = 0.95),
        legend.title.align=0.22)
ggsave(filename = file.path(plotpath, "15mo_corrmat.pdf"), plot = p)
p

# run again with PCA on continuous predictors
form2use = as.formula(sprintf("%s ~ sex + tx_broad + PC1 + PC2 + PC3 + PC4 + PC5 + (1 | dataset)",responder_var))

# fit model
result = logistic_regression(data = tidy_data, 
                             formula = form2use, 
                             vars2scale = continuous_predictors, 
                             responder_var = responder_var,
                             do_pca = TRUE,
                             pcavars = pcavars,
                             recon_pcs = c("PC1","PC2"))
## Generalized linear mixed model fit by maximum likelihood (Adaptive
##   Gauss-Hermite Quadrature, nAGQ = 10) [glmerMod]
##  Family: binomial  ( logit )
## Formula: responder15 ~ sex + tx_broad + PC1 + PC2 + PC3 + PC4 + PC5 +  
##     (1 | dataset)
##    Data: scaled_data2use
##      AIC      BIC   logLik deviance df.resid 
##  98.2549 128.1584 -38.1275  76.2549      101 
## Random effects:
##  Groups  Name        Std.Dev.
##  dataset (Intercept) 0       
## Number of obs: 112, groups:  dataset, 8
## Fixed Effects:
##  (Intercept)       sexMale  tx_broadEIBI  tx_broadNDBI  tx_broadESDM  
##      -4.0086        0.7012        6.1246        5.8779        5.1198  
##          PC1           PC2           PC3           PC4           PC5  
##      -0.3347        1.9257        0.6276        1.3699        1.1460  
## optimizer (bobyqa) convergence code: 0 (OK) ; 0 optimizer warnings; 1 lme4 warnings 
##                    beta    ci.low95  ci.high85           OR  ci.OR.low95
## (Intercept)  -4.0086424 -7.26115953 -0.7561253   0.01815803 0.0007022932
## sexMale       0.7012270 -0.77939627  2.1818503   2.01622514 0.4586828501
## tx_broadEIBI  6.1245863  1.37623285 10.8729397 456.95561043 3.9599557325
## tx_broadNDBI  5.8778524  1.81200071  9.9437040 357.04162755 6.1226848980
## tx_broadESDM  5.1198333  1.68649802  8.5531685 167.30747427 5.4005349918
## PC1          -0.3347430 -0.88043618  0.2109501   0.71552193 0.4146020330
## PC2           1.9257325  1.01139271  2.8400723   6.86017199 2.7494275095
## PC3           0.6275639 -0.22669614  1.4818240   1.87304214 0.7971629744
## PC4           1.3698956  0.44245971  2.2973314   3.93493972 1.5565311286
## PC5           1.1460226 -0.03379945  2.3258446   3.14565636 0.9667653672
##              ci.OR.high95         pval          fdr
## (Intercept)      0.469482 1.570720e-02 0.0261786702
## sexMale          8.862690 3.532722e-01 0.3532721608
## tx_broadEIBI 52729.990942 1.146902e-02 0.0229380315
## tx_broadNDBI 20820.722596 4.604131e-03 0.0115103274
## tx_broadESDM  5183.151482 3.469249e-03 0.0115103274
## PC1              1.234851 2.292406e-01 0.2547117578
## PC2             17.117003 3.658607e-05 0.0003658607
## PC3              4.400966 1.499036e-01 0.1873794585
## PC4              9.947601 3.790693e-03 0.0115103274
## PC5             10.235321 5.692971e-02 0.0813281626
# pairwise comparisons of tx_broad
pairwise_res = emmeans(result$model, pairwise ~ tx_broad)
pairwise_res
## $emmeans
##  tx_broad emmean    SE  df asymp.LCL asymp.UCL
##  Other     -3.56 1.502 Inf    -6.500    -0.612
##  EIBI       2.57 1.745 Inf    -0.852     5.990
##  NDBI       2.32 0.797 Inf     0.761     3.883
##  ESDM       1.56 0.651 Inf     0.287     2.841
## 
## Results are averaged over the levels of: sex 
## Results are given on the logit (not the response) scale. 
## Confidence level used: 0.95 
## 
## $contrasts
##  contrast     estimate    SE  df z.ratio p.value
##  Other - EIBI   -6.125 2.423 Inf  -2.528  0.0557
##  Other - NDBI   -5.878 2.074 Inf  -2.833  0.0238
##  Other - ESDM   -5.120 1.752 Inf  -2.923  0.0182
##  EIBI - NDBI     0.247 1.763 Inf   0.140  0.9990
##  EIBI - ESDM     1.005 1.804 Inf   0.557  0.9447
##  NDBI - ESDM     0.758 0.982 Inf   0.772  0.8672
## 
## Results are averaged over the levels of: sex 
## Results are given on the log odds ratio (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 4 estimates
summary(result$pca_res)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.8949 1.1039 1.0383 0.9388 0.70242 0.65228 0.44322
## Proportion of Variance 0.4489 0.1523 0.1348 0.1102 0.06167 0.05318 0.02456
## Cumulative Proportion  0.4489 0.6012 0.7359 0.8461 0.90778 0.96097 0.98552
##                            PC8
## Standard deviation     0.34034
## Proportion of Variance 0.01448
## Cumulative Proportion  1.00000
data2plot = result$pca_res$rotation[hc$order,]
data2plot = melt(data2plot)
p = ggplot(data = data2plot, aes(x = Var2, y = Var1, fill = value, label = round(value,2))) +
  geom_tile() +
  geom_text() + 
  labs(x = NULL, y = NULL, fill = "Loading") +
  scale_fill_gradientn(colours = colorRampPalette(c("blue","white","red"))(100), 
                       limits = c(-0.5,0.5), 
                       oob = scales::squish) +
  scale_x_discrete(labels = colnames(result$pca_res$rotation)) +
  scale_y_discrete(labels = rev(c("Intensity","Age at Start","ADOS CSS","Nonverbal DQ",
                              "Verbal DQ","VABS ABC","Duration","Imitation"))) + 
  ggtitle("PCA Loadings") + 
  easy_center_title() +  
  theme_minimal() +
  theme(axis.text.y = element_text(hjust = 0.95),
        plot.title = element_text(hjust = 0.5))
ggsave(filename = file.path(plotpath, "15mo_pcaloadings.pdf"), plot = p)
p

# plot screeplot
pca_res = prcomp(data2use[,pcavars], scale=TRUE)
data2plot = data.frame(comps = colnames(pca_res$rotation),
           percent_var = ((pca_res$sdev^2) / sum(pca_res$sdev^2)) * 100)
p = ggplot(data = data2plot, aes(x = comps, y = percent_var)) + 
  geom_point() + 
  geom_line(aes(group=1)) + 
  xlab("Components") + ylab("Percentage Variance Explained") + 
  theme_bw() + 
  theme(plot.title = element_text(hjust = 0.5, size=fontSize),
      strip.text.y = element_text(size=fontSize),
      axis.text.x = element_text(size=fontSize),
      axis.text.y = element_text(size=fontSize-2),
      axis.title.x = element_blank(),
      axis.title.y = element_text(size=fontSize),
      strip.text = element_text(size = fontSize))
ggsave(filename = file.path(plotpath,"15mo_screeplot.pdf"), plot = p)
p

# Plot the PCs by responder status
data2plot = result$data
vars2plot = c("PC2","PC4")
for (var2plot in vars2plot){
  p = ggplot(data = data2plot, aes(x = .data[[responder_var]], 
                                    y = .data[[var2plot]], 
                                    colour = .data[[responder_var]])) + 
    geom_scatterbox() + ggtitle(var2plot) + easy_center_title()
  print(p)
  
  d_res = cohens_d(x = data2plot[data2plot[,responder_var]=="Responder", var2plot], 
           y = data2plot[data2plot[,responder_var]=="Non-Responder", var2plot])
  print(sprintf("Effect size for %s is d = %f", var2plot, d_res))
}

## [1] "Effect size for PC2 is d = 0.766249"

## [1] "Effect size for PC4 is d = 0.298752"
# plot recon data retaining PC2 and PC4
tmp_res = logistic_regression(data = tidy_data, 
                             formula = form2use, 
                             vars2scale = continuous_predictors, 
                             responder_var = responder_var,
                             do_pca = TRUE,
                             pcavars = pcavars,
                             recon_pcs = c("PC2","PC4"),
                             print_output = FALSE)
data2plot = tmp_res$pca_recon_data


vars2plot = c("age_at_start","tx_duration","intensity",
              "pre_imitation","pre_vabs_abc","pre_ados_css",
              "pretx_verbal_dq","pretx_nonverbal_dq")
varnames2use = c("Age at Start","Treatment Duration","Treatment Intensity",
              "Imitation","VABS ABC","ADOS CSS",
              "Verbal DQ","Nonverbal DQ")
i = 0
plots2save = list()
for (var2plot in vars2plot){
  i = i+1
  p = ggplot(data = data2plot, aes(x = .data[[responder_var]], 
                                    y = .data[[var2plot]], 
                                    colour = .data[[responder_var]])) + 
    # geom_scatterbox() + 
    geom_jitter(size=10) +
    geom_boxplot(fill = NA, colour = "#000000", outlier.shape = NA) +
    ylab(varnames2use[i]) + 
    scale_colour_manual(values=cols2use) + 
    scale_x_discrete(labels = c("Non-Advancer", "Advancer")) +
    guides(colour="none") +
    ggtitle(varnames2use[i]) + easy_center_title() + 
    easy_remove_x_axis(what = "title") + 
    theme_bw() + 
    theme(plot.title = element_text(hjust = 0.5, size=fontSize+8),
      strip.text.y = element_text(size=fontSize),
      axis.text.x = element_text(size=fontSize),
      axis.text.y = element_text(size=fontSize-2),
      axis.title.x = element_blank(),
      axis.title.y = element_text(size=fontSize),
      strip.text = element_text(size = fontSize))
  ggsave(filename = file.path(plotpath,sprintf("%s_15mo_reconplot.pdf",var2plot)), plot = p)
  plots2save[[i]] = p
  print(p)
  
  d_res = cohens_d(x = data2plot[data2plot[,responder_var]=="Responder", var2plot], 
         y = data2plot[data2plot[,responder_var]=="Non-Responder", var2plot])
  print(sprintf("Effect size for %s is d = %f", var2plot, d_res))
}
## [1] "Effect size for age_at_start is d = 0.756136"

## [1] "Effect size for tx_duration is d = 0.565083"

## [1] "Effect size for intensity is d = 0.789705"

## [1] "Effect size for pre_imitation is d = 0.794774"

## [1] "Effect size for pre_vabs_abc is d = 0.207615"

## [1] "Effect size for pre_ados_css is d = -0.356252"

## [1] "Effect size for pretx_verbal_dq is d = 0.400548"

## [1] "Effect size for pretx_nonverbal_dq is d = -0.532218"
p_final = plots2save[[1]] + plots2save[[3]] + plots2save[[6]] + plots2save[[4]] + plots2save[[5]] + plots2save[[8]] + plots2save[[7]] + plots2save[[2]] + plot_layout(nrow = 2, ncol = 4)
ggsave(filename = file.path(plotpath,"15mo_reconplot.pdf"), plot = p_final, 
       width = 24, height=15)
p_final